Theme Song



FRA : Elle me dit

CHN : 她对我说

ENG : She told me

JPN : 彼女は私に言う


FRA : écris une chanson contente

CHN : 写一首欢快的歌

ENG : Write a happy song

JPN : 幸せな歌を書く


FRA : Pas une chanson déprimante

CHN : 而不是悲伤的歌

ENG : Not a depressing song

JPN : 気のめいるような歌ではない


FRA : Une chanson que tout le monde aime

CHN : 一首让所有人都喜欢的歌

ENG : A song that everyone loves

JPN : みんなが大好きな曲


FRA : Elle me dit

CHN : 她对我说

ENG : She told me

JPN : 彼女は私に言う


FRA : Tu deviendras milliardaire

CHN : 你将成为亿万富翁

ENG : You will become a billionaire

JPN : あなたは億万長者になります


FRA : Tu auras de quoi être fier

CHN : 你将为此感到骄傲

ENG : You will be proud

JPN : あなたは誇りに思うでしょう



1 Introduction

Due to the issues raised in Deriv.com - Interday High Frequency Trading Models Comparison Blooper, here I revisit the research using the same dataset, as described in Part I below.

In this paper I use SARIMA models.

Load Packages

## Bootstrap: BBmisc provides suppressAll() and lib() used below.
if(!suppressPackageStartupMessages(require('BBmisc'))) {
  install.packages('BBmisc', dependencies = TRUE, INSTALL_opts = '--no-lock')
}
suppressPackageStartupMessages(require('BBmisc'))
# suppressPackageStartupMessages(require('rmsfuns'))

## Packages required by this paper. Duplicated entries from the original
## vector ('devtools', 'stringr', 'htmltools') have been removed.
pkgs <- c('devtools', 'knitr', 'kableExtra', 'tint', 'furrr', 'tidyr', 
          'readr', 'lubridate', 'data.table', 'reprex', 
          'feather', 'purrr', 'quantmod', 'tidyquant', 'tibbletime', 
          'timetk', 'plyr', 'dplyr', 'stringr', 'magrittr', 'tdplyr', 
          'tidyverse', 'memoise', 'htmltools', 'formattable', 'dtplyr', 
          'zoo', 'forecast', 'seasonal', 'seasonalview', 'rjson', 
          'rugarch', 'rmgarch', 'mfGARCH', 'sparklyr', 'jcolors', 
          'microbenchmark', 'dendextend', 'lhmetools', 'ggthemr', 
          'pacman', 'profmem', 'ggthemes', 'flyingfox', 
          'echarts4r', 'viridis', 'hrbrthemes', 
          'fable', 'fabletools', 'Rfast', 'Metrics', 'MLmetrics')

suppressAll(lib(pkgs))
# load_pkg(pkgs)

## Root of the data repository used throughout this paper.
.dtr <- 'C:/Users/User/Documents/GitHub/binary.com-interview-question-data/'

## Set the timezone but not change the datetime
Sys.setenv(TZ = 'Asia/Tokyo')
## options(knitr.table.format = 'html') will set all kableExtra tables to
## be 'html' once, otherwise the parameter must be set on every table.
## NOTE(review): warn = -1 silences ALL warnings for the whole document;
## kept for the rendered report, but bear in mind it can hide problems.
options(warn = -1, knitr.table.format = 'html')#, digits.secs = 6)

## https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-abnor-in-rmd-but-not-in-r-script
knitr::opts_chunk$set(cache = TRUE, warning = FALSE, 
                      message = FALSE, cache.lazy = FALSE)

rm(pkgs)

Progress Function

task_progress <- function(mbase, timeID0 = NULL, scs = 60, .pattern = '^sarima_', .loops = TRUE) {
  ## ------------- Scheduled progress monitor ----------------------
  ## Prints (once, or every `scs` seconds when .loops = TRUE) a per-model
  ## table of how many forecast result files matching `.pattern` have been
  ## saved under data/fx/USDJPY/, relative to the trading dates in `mbase`.
  ##
  ## mbase    : data holding an `index` datetime column (coerced to
  ##            data.frame when needed).
  ## timeID0  : optional start date; progress is counted from it onwards.
  ## scs      : seconds to sleep between refreshes when looping.
  ## .pattern : file-name pattern of the saved model results.
  ## .loops   : TRUE = keep polling until progress reaches 100%;
  ##            FALSE = print a single snapshot.
  require('magrittr')
  require('tibble')
  
  ## BUG FIX: the original tested is.data.frame(class(mbase)), which is
  ## always FALSE because class() returns a character vector — the
  ## coercion below never ran. Test the object itself instead.
  if(!is.data.frame(mbase)) { 
    mbase %<>% data.frame
  }
  
  ## One progress snapshot; returns TRUE once progress reaches 100%.
  ## drop_saturday: the looping mode omits the weekly last price (12:00am
  ## Saturday); the one-shot mode historically did not — that asymmetry
  ## from the original is preserved here.
  .snapshot <- function(drop_saturday) {
    cat('Current Tokyo Time :', as.character(now('Asia/Tokyo')), '\n\n')
    
    y = as_date(mbase$index) %>% 
          unique
    if (drop_saturday) {
      ## filter and omit the weekly last price which is 12:00am on Saturday
      y <- y[weekdays(y) != 'Saturday']
    }
    datee = y
    
    ## Resolve the starting date: NULL -> first date; Date -> as given;
    ## anything else falls back to the full unique-date vector (as in the
    ## original code).
    if(is.null(timeID0)) { 
      t0 = y[1]
    } else if (is.Date(timeID0)) { 
      t0 = as_date(timeID0)
    } else {
      t0 = as_date(mbase$index) %>% 
        unique
    }
    y = y[y >= t0]
    
    ## Parse saved file names 'model_...YYYY-MM-DD.rds' into Model / Date.
    x = list.files(paste0(.dtr, 'data/fx/USDJPY/'), pattern = .pattern) %>% 
        str_replace_all('.rds', '') %>% 
        str_replace_all('.201', '_201') %>% 
        str_split_fixed('_', '2') %>% 
        as_tibble %>% 
        dplyr::rename('Model' = 'V1', 'Date' = 'V2') %>% 
        dplyr::mutate(Model = factor(Model), Date = as_date(Date))
    
    x = join(tibble(Date = datee), x) %>% 
        as_tibble
    x %<>% na.omit
    
    ## One 0/1 column per model: has this date's file been saved yet?
    x %<>% dplyr::mutate(binary = if_else(is.na(Model), 0, 1)) %>% 
        spread(Model, binary)
    
    ## Count saved files per model and convert to a completion percentage.
    z <- ldply(x[,-1], function(zz) {
        na.omit(zz) %>% length }) %>% 
        dplyr::rename(x = V1) %>% 
        dplyr::mutate(n = length(y), progress = percent(x/n))
    
    print(z)
    
    prg = sum(z$x)/sum(z$n)
    cat('\n================', as.character(percent(prg)), '================\n\n')
    prg == 1
  }
  
  if (.loops == TRUE) {
    while(1) {
      ## Stop refreshing once progress has reached 100%.
      if (.snapshot(drop_saturday = TRUE)) break
      ## The ldply() above takes ~3-5 seconds; rest `scs` seconds between polls.
      Sys.sleep(scs)
    }
  } else {
    .snapshot(drop_saturday = FALSE)
  }
}

2 Data

2.1 Read Data

## Make sure the data-repository path is defined.
if (!exists('dtr')) {
  dtr <- 'C:/Users/User/Documents/GitHub/binary.com-interview-question-data/'
}

## Persist the in-memory dataset once, if it has not been saved yet.
if (!file.exists(paste0(dtr, 'data/fx/USDJPY/dsmp.rds')) & exists('dsmp')) {
  saveRDS(dsmp, paste0(dtr, 'data/fx/USDJPY/dsmp.rds'))
}

## Otherwise load the previously saved dataset from disk.
if (!exists('dsmp')) {
  dsmp <- readRDS(paste0(dtr, 'data/fx/USDJPY/dsmp.rds'))
}
## Display the first 3 and last 4 rows of the 1-minute close-price dataset.
## https://www.w3schools.com/cssref/css_colors.asp
## https://public.tableau.com/en-us/gallery/100-color-palettes?gallery=votd
local({
  smp <- dsmp[c(1:3, (nrow(dsmp) - 3):nrow(dsmp)), ]
  ## Background colour for each of the 12 columns, in column order.
  bg <- c('CornflowerBlue', 'Gray', 'DarkGrey', 'Gray', 'DarkGrey',
          '#4897D8', '#556DAC', '#92AAC7', '#556DAC', '#375E97',
          'CornflowerBlue', 'LightGray')
  tbl <- smp %>%
    kbl(caption = '1 min Close Price Dataset', escape = FALSE) %>%
    row_spec(0, background = 'DimGrey', color = 'gold', bold = TRUE)
  for (j in seq_along(bg)) {
    if (j == 12) {
      ## The last column additionally gets a goldenrod font colour.
      tbl <- column_spec(tbl, j, background = bg[j], color = 'goldenrod')
    } else {
      tbl <- column_spec(tbl, j, background = bg[j])
    }
  }
  tbl %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
    ## `full_width = FALSE` auto-adjusts every column width to fit the table.
    kable_material(full_width = FALSE) %>%
    scroll_box(width = '100%', fixed_thead = TRUE, height = '400px')
})
1 min Close Price Dataset
index year quarter month week wkdays wk_1m dy_1m hr_1m sq date close
2015-01-05 00:01:00 2015 1 1 1 Monday 1 1 1 1 2015-01-05 120.5740
2015-01-05 00:02:00 2015 1 1 1 Monday 2 2 2 2 2015-01-05 120.5900
2015-01-05 00:03:00 2015 1 1 1 Monday 3 3 3 3 2015-01-05 120.6035
2018-07-06 23:57:00 2018 3 7 27 Friday 7197 1437 57 1324797 2018-07-06 110.4635
2018-07-06 23:58:00 2018 3 7 27 Friday 7198 1438 58 1324798 2018-07-06 110.4740
2018-07-06 23:59:00 2018 3 7 27 Friday 7199 1439 59 1324799 2018-07-06 110.4740
2018-07-07 00:00:00 2018 3 7 27 Saturday 7200 1440 60 1324800 2018-07-07 110.4740

source : 1324800 x 12

From the above table, we can see that the dataset covers the period from 2015-01-05 to 2018-07-07, the same data used in Deriv.com - Interday High Frequency Trading Models Comparison Review (Part I).

3 Modelling

Here my first forecast date is the first trading datetime of 2016 (2016-01-04, the start of the 2nd year in the dataset), consistent with Deriv.com - Interday High Frequency Trading Models Comparison Review (Part I).

3.1 Sarima ts() & auto.arima()

3.1.1 Seasonal Data Modeling

I set the length of data as weekly (7200 minutes which is 5 trading days) to forecast 1440 minutes (1440 minutes is a trading day).

Here I only use 7200 observations to forecast 1440, since we know from Deriv.com - Interday High Frequency Trading Models Comparison Review (Part I) that this is the optimal volume of observation data.

3.1.2 Wk >> Dy

I set the length of data as weekly (5 days * 1440 mins = 7200 mins, which is 5 trading days) to forecast 1440 minutes (one trading day).

sarima <- function(timeID, data = dsmp, data_len, hrz1 = 1440, 
                   .model = c('auto', 'Arima'), .d = NA, .D = NA, 
                   .order = c(0, 0, 0), .seasonal = c(0, 0, 0)) {
  ## Rolling SARIMA forecast over the trading dates in `timeID`.
  ##
  ## timeID    : vector of forecast dates; element i defines window i.
  ## data      : minute-level data.table with columns index, date, sq,
  ##             close. (The original body referenced the global `dsmp`
  ##             directly; it now uses this argument, whose default is
  ##             `dsmp`, so existing calls behave identically.)
  ## data_len  : number of trailing observations used for training.
  ## hrz1      : forecast horizon in minutes (1440 = one trading day).
  ## .model    : 'auto' (auto.arima) or 'Arima' (manually specified orders).
  ## .d, .D    : differencing orders passed to auto.arima().
  ## .order, .seasonal : orders passed to Arima() when .model = 'Arima'.
  ##
  ## BUG FIX: `.model` defaults to a length-2 vector and the original
  ## compared it with `==` inside `if()`, which errors on vectors in
  ## modern R. match.arg() validates the choice and selects 'auto' when
  ## the default vector is passed, matching the intended behaviour.
  .model <- match.arg(.model)
  
  ## Fit one model on `train`, forecast hrz1 steps ahead, print each
  ## stage for logging and save the forecast to an .rds file.
  ## (The two near-identical branches of the original are merged here;
  ## this also removes the original's `rm(sets)`, which referenced a
  ## non-existent object — the else-branch correctly used `rm(srm)`.)
  .fit_save <- function(i, train) {
    cat('\n')
    cat('===========================================\n')
    cat('train[', i, ']\n')
    print(train)
    ## sq range covering the training window plus the forecast horizon.
    ctr <- train$sq[1]:(range(train$sq)[2] + hrz1)
    
    cat('\n')
    cat('-------------------------------------------\n')
    cat('train_test[', i, ']\n')
    print(train_test <- data[sq %in% ctr])
    
    srm <- train[, .(index, close)] %>% 
      tk_ts(frequency = hrz1)
    
    if (.model == 'auto') {
      srm %<>% auto.arima(d = .d, D = .D)
    } else if (.model == 'Arima') {
      srm %<>% Arima(order = .order, seasonal = .seasonal)
    }
    
    ## Forecast and place the realised market price next to the
    ## forecast price for later comparison.
    srm %<>% 
      forecast(h = hrz1) %>% 
      tk_tbl %>% 
      dplyr::mutate(index = train_test[(.N - hrz1 + 1):.N,]$index, 
                    mk.price = train_test[(.N - hrz1 + 1):.N,]$close) %>% 
      dplyr::rename(fc.price = `Point Forecast`) %>% 
      dplyr::select(index, mk.price, fc.price)
    
    cat('\n')
    cat('-------------------------------------------\n')
    cat('forecast[', i, ']\n')
    print(srm %>% as.data.table)
    
    saveRDS(srm, paste0(
      dtr, 'data/fx/USDJPY/sarima_', .model, '_', data_len, 
      '_', hrz1, '.', as_date(srm$index[1]), '.rds'))
    
    cat('\n')
    cat(i, '=', paste0('~/data/fx/USDJPY/sarima_', .model, '_', 
                       data_len, '_', hrz1, '.', 
                       as_date(srm$index[1]), '.rds saved!\n'))
    cat('\n\n')
    invisible(NULL)
  }
  
  tmp <- llply(seq_along(timeID), function(i) {
    if (i == 1) {
      ## First window: last data_len rows strictly before the first date.
      .fit_save(i, data[date < timeID[i]][(.N - (data_len - 1)):.N])
    } else if (i %in% seq(1, length(timeID), by = 6)[-1]) {
      ## Intentionally skipped in the original design; kept as-is.
    } else if (i == length(timeID)) {
      ## Intentionally skipped in the original design; kept as-is.
    } else {
      ## Later windows: data_len rows ending at the last sq before date i.
      lst_sq <- data[date < timeID[i],][.N]$sq + 1
      .fit_save(i, data[(lst_sq - data_len + 1):lst_sq])
    }
  })
  return(tmp)
}
source('function/sarima.R')
# --------- eval=FALSE ---------
## Forecast dates: every trading date from the first trading day of 2016
## (the 2nd year of the dataset) onwards.
timeID <- unique(dsmp$date)
bse <- dsmp[year == 2016]$date[1] #"2016-01-04" #1st trading date in 2nd year
timeID %<>% .[. >= bse]
#timeID %<>% .[. >= as_date('2016-01-04')]
data_len <- 7200 #last 7200 observations dsmp[(.N - (data_len - 1)):.N]
hrz1 <- 1440

## Model settings. (The original assigned `.model` twice on consecutive
## lines; the duplicate has been removed.)
.model = c('auto', 'Arima')
.d = NA
.D = NA 
.order = c(0, 0, 0)
.seasonal = c(0, 0, 0)

## Run both variants. Pass the .order/.seasonal settings defined above
## instead of re-hardcoding the literals in the call.
llply(.model, function(md) {
  sarima(timeID = timeID, dsmp, 
        data_len = data_len, hrz1 = hrz1, 
        .model = md, 
        .d = .d, .D = .D, 
        .order = .order, 
        .seasonal = .seasonal)
  })

3.2 Sarima msts() & auto.arima()

3.5 msts() & hw()

After looking a bit more, I've come across this question where a user wanted to use the hw method to forecast half-hourly electricity demand using the taylor dataset available in the forecast package.

As Professor Rob Hyndman suggests in the response to the linked question, the double seasonal Holt-Winters method dshw from the forecast package can be used to deal with half-hourly data.

After removing the yearly seasonality parameter (seasonal.periods = 8760) in the definition of my msts object, I ran the model and it provided a pretty accurate result.

How can I apply Seasonal Exponential Smoothing forecasting method to hourly data in R

3.6 msts() & tbats()

3.6.1 Seasonal Data Modeling

I set the length of data as weekly (7200 minutes which is 5 trading days) to forecast 1440 minutes (1440 minutes is a trading day).

Chapter 7 Multivariate TS Analysis in Introduction to Time Series Analysis and Forecasting in R, but I use univariate due to some errors as mentioned in beginning.

I tried to use weeks(1), months(1), months(3) and years(1), but the number of observations per period is not constant; we can refer to The seasonal period.

Here I filter up the data as below :

  • 5 days * 1440 mins = 7200 mins = weekly
  • 22 days * 1440 mins = 31680 mins = monthly
  • 3 months * 22 days * 1440 mins = 95040 mins = quarterly
  • 52 weeks * 5 days * 1440 mins = 374400 mins = yearly

3.6.2 (Wk ∈ Mn) >> Dy

I set the length of data as monthly (22 days * 1440 mins = 31680 mins, which is 22 trading days), with nested weekly seasonality (5 days * 1440 mins = 7200 mins, which is 5 trading days) and daily seasonality (1440 mins, one trading day), to forecast 1440 minutes (one trading day).

  • data observation length 22 days * 1440 mins = 31680 mins
  • seasonal 5 days * 1440 mins = 7200 mins
  • sub-seasonal 1440 mins
  • forecast 1440 mins
# --------- eval=FALSE ---------
## Forecast dates: every trading date from the first trading day of 2016
## (the 2nd year of the dataset) onwards.
timeID <- unique(dsmp$date)
bse <- dsmp[year == 2016]$date[1] #"2016-01-04" #1st trading date in 2nd year
timeID <- timeID[timeID >= bse]
#timeID <- timeID[timeID >= as_date('2016-01-04')]
data_len <- 7200 #last 7200 observations dsmp[(.N - (data_len - 1)):.N]
hrz1 <- 1440
hrz2 <- 1440

## Fit every exponential-smoothing model listed in `ets.m` over the same
## rolling windows via mstseas().
llply(ets.m, function(mdl) {
  mstseas(timeID = timeID, dsmp,
          data_len = data_len, hrz1 = hrz1,
          hrz2 = hrz2, .model = mdl)
})

3.6.3 (Wk ∈ Qt) >> Dy

I set the length of data as quarterly (3 months * 22 days * 1440 mins = 95040 mins, which is 66 trading days), with nested weekly seasonality (5 days * 1440 mins = 7200 mins, which is 5 trading days) and daily seasonality (1440 mins, one trading day), to forecast 1440 minutes (one trading day).

  • data observation length 3 months * 22 days * 1440 mins = 95040 mins
  • seasonal 5 days * 1440 mins = 7200 mins
  • sub-seasonal 1440 mins
  • forecast 1440 mins

3.6.4 (Wk ∈ Yr) >> Dy

I set the length of data as yearly (52 weeks * 5 days * 1440 mins = 374400 mins, which is 260 trading days), with nested weekly seasonality (5 days * 1440 mins = 7200 mins, which is 5 trading days) and daily seasonality (1440 mins, one trading day), to forecast 1440 minutes (one trading day).

  • data observation length 52 weeks * 5 days * 1440 mins = 374400 mins
  • seasonal 5 days * 1440 mins = 7200 mins
  • sub-seasonal 1440 mins
  • forecast 1440 mins

3.6.5 (Mn ∈ Qt) >> Dy

I set the length of data as quarterly (3 months * 22 days * 1440 mins = 95040 mins, which is 66 trading days), with nested monthly seasonality (22 days * 1440 mins = 31680 mins, which is 22 trading days) and daily seasonality (1440 mins, one trading day), to forecast 1440 minutes (one trading day).

  • data observation length 3 months * 22 days * 1440 mins = 95040 mins
  • seasonal 22 days * 1440 mins = 31680 mins
  • sub-seasonal 1440 mins
  • forecast 1440 mins

4 Comparison

4.1 Read Models

4.1.1 Grouped Models

Here I read the saved models.

Because the models only forecast 1440 mins (not 7200 mins) in advance, there is no need to filter the forecast price here.

5 Conclusion

5.1 Summary

5.2 Final Conclude

From the final stage models comparison, we know that ******* is the best model.

5.3 Future Studies

Next papers will compare tbats, midas, sarimax etc.

6 Appendix

6.1 Blooper

6.2 Documenting File Creation

It’s useful to record some information about how your file was created.

  • File creation date: 2021-02-03
  • File latest updated date: 2021-02-03
  • R version 4.0.3 (2020-10-10)
  • R version (short form): 4.0.3
  • rmarkdown package version: 2.6
  • File version: 1.0.0
  • Author Profile: ®γσ, Eng Lian Hu
  • GitHub: Source Code
  • Additional session information:
suppressMessages(require('dplyr', quietly = TRUE))
suppressMessages(require('magrittr', quietly = TRUE))
suppressMessages(require('formattable', quietly = TRUE))
suppressMessages(require('knitr', quietly = TRUE))
suppressMessages(require('kableExtra', quietly = TRUE))

## Platform details (R version, OS, locale, ...) as a two-column frame.
sys1 <- devtools::session_info()$platform %>% 
  unlist %>% data.frame(Category = names(.), session_info = .)
rownames(sys1) <- NULL

## System details from Sys.info(), reordered to Category / value columns.
sys2 <- data.frame(Sys.info()) %>% 
  dplyr::mutate(Category = rownames(.)) %>% .[2:1]
names(sys2)[2] <- 'Sys.info'  # was c('Sys.info'); c() on one string is redundant
rownames(sys2) <- NULL

## Pad the shorter frame with a 'Current time' row so both frames have
## the same number of rows before cbind().
if (nrow(sys1) == 9 & nrow(sys2) == 8) {
  sys2 %<>% rbind(., data.frame(
  Category = 'Current time', 
  Sys.info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
} else {
  sys1 %<>% rbind(., data.frame(
  Category = 'Current time', 
  session_info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
}

## Side-by-side styled table; the last row (Current time) is highlighted.
sys <- cbind(sys1, sys2) %>% 
  kbl(caption = 'Additional session information:') %>% 
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>% 
  row_spec(0, background = 'DimGrey', color = 'yellow') %>% 
  column_spec(1, background = 'CornflowerBlue', color = 'red') %>% 
  column_spec(2, background = 'grey', color = 'black') %>% 
  column_spec(3, background = 'CornflowerBlue', color = 'blue') %>% 
  column_spec(4, background = 'grey', color = 'white') %>% 
  row_spec(9, bold = TRUE, color = 'yellow', background = '#D7261E')  # TRUE, not T

rm(sys1, sys2)
sys
Additional session information:
Category session_info Category Sys.info
version R version 4.0.3 (2020-10-10) sysname Windows
os Windows 10 x64 release 10 x64
system x86_64, mingw32 version build 19042
ui RTerm nodename SCIBROKES-TRADI
language en machine x86-64
collate English_World.1252 login Owner
ctype English_World.1252 user Owner
tz Asia/Tokyo effective_user Owner
date 2021-02-03 Current time 2021-02-03 04:32:31 JST<U+0001F5FE>